Analyses parameters

output_folder <- getwd()
output_format <- ".pdf"

Parsing taxonomic data

The code below downloads a compressed archive and uncompresses it:

download_and_extract <- function(url) {
  temp_dir_path <- tempdir()
  local_file_path <- file.path(temp_dir_path, basename(url))
  download.file(url = url, destfile = local_file_path, quiet = TRUE)
  unzipped_path <- R.utils::gunzip(local_file_path, overwrite = TRUE)
  unzipped_path[1]
}
v35_data <- parse_hmp_qiime(otu_file = "http://downloads.hmpdacc.org/data/HMQCP/otu_table_psn_v35.txt.gz",
                            mapping_file = "http://downloads.hmpdacc.org/data/HMQCP/v35_map_uniquebyPSN.txt.bz2")

Plot of everything

calculate_abundance <- function(data) {
  mutate_taxa(data, abundance = vapply(obs(data), function(i) sum(data$obs_data[i, colnames(data$obs_data) %in% data$mapping$sample_id]), numeric(1)))
}

v35_data <- calculate_abundance(v35_data)


plot_all <- function(data, output_name, seed = 1) {
  set.seed(seed)
  data %>%
    filter_taxa(abundance >= 100) %>%
    plot(node_size = n_obs,
         node_size_axis_label = "Number of OTUs",
         node_color = abundance,
         node_color_trans = "area",
         node_color_axis_label = "Number of reads",
         node_label = name,
         node_label_max = 100,
         overlap_avoidance = 1,
         # initial_layout = "re", layout = "da",
         output_file = file.path(output_folder, paste0(output_name, "--seed_", seed, output_format)))
}

plot_all(v35_data, "hmp--v35--all_data")

Plot body site differences

calculate_prop <- function(data, site) {
  sample_cols <- as.character(unlist(data$mapping[data$mapping$body_site == site, "sample_id"]))
  sample_cols <- sample_cols[sample_cols %in% colnames(data$obs_data)]
  obs_indexes <- obs(data)
  total_counts <- vapply(sample_cols, function(s) sum(data$obs_data[, s]), numeric(1))
  col_name <- paste0(site, "_median_prop")
  lapply(obs_indexes,
         function(i) {
           vapply(sample_cols, 
                  function(s) sum(data$obs_data[i, s]) / total_counts[s],
                  numeric(1))})
}

calculate_prop_diff <- function(data, site_1, site_2) {
  props_1 <- calculate_prop(data, site_1)
  props_2 <- calculate_prop(data, site_2)
  p_value <- mapply(function(x, y) wilcox.test(x, y)$p.value,
                    props_1, props_2)
  p_value <- p.adjust(p_value, method = "bonferroni")
  ifelse(p_value > 0.05 | is.nan(p_value), 0, vapply(props_1, median, numeric(1)) - vapply(props_2, median, numeric(1)))
}

plot_body_site_diff <- function(data, site_1, site_2, output_name, seed = 1) {
  # calculate site difference
  data$taxon_data$median_prop_diff <- calculate_prop_diff(data, site_1, site_2)
  
  # plot
  set.seed(seed)
  data %>%
    filter_taxa(abundance >= 100) %>%
    # filter_taxa(n_supertaxa <= 4) %>%
    plot(node_size_axis_label = "Number of OTUs",
         node_size = n_obs,
         node_color_axis_label = paste0(site_1, " only    Both    ", site_2, " only"),
         node_color = median_prop_diff,
         node_color_range = diverging_palette(),
         node_color_trans = "area",
         node_color_interval = max(abs(range(median_prop_diff))) * c(-1, 1),
         edge_color_interval = max(abs(range(median_prop_diff))) * c(-1, 1),
         node_label = name,
         node_label_max = 150,
         overlap_avoidance = 1,
         # initial_layout = "re", layout = "da",
         # maxiter = 20, fineiter = 20,
         # margin_size = c(0.001, 0.001),
         output_file = file.path(output_folder, paste0(output_name, "--", site_1, "_vs_", site_2, "--seed_", seed, output_format)))
}

plot_body_site_diff(v35_data, "Throat", "Saliva", "hmp--v35")

plot_body_site_diff(v35_data,  "Supragingival_plaque", "Subgingival_plaque", "hmp--v35")

plot_body_site_diff(v35_data,  "Anterior_nares", "Buccal_mucosa", "hmp--v35")

plot_body_site_diff_small <- function(data, site_1, site_2, seed = 1) {
  # calculate each site data
  data <- calculate_body_site_abundance(data, site_1)
  data <- calculate_body_site_abundance(data, site_2)
   
  # filter out taxa without reads in either site
  data <- filter_taxa(data,
                      data$taxon_data[[paste0(site_1, "_prop")]] != 0 | data$taxon_data[[paste0(site_2, "_prop")]] != 0,
                      supertaxa = TRUE)
  
  # calculate scaled site difference
  prop_1 <- data$taxon_data[[paste0(site_1, "_prop")]]
  prop_2 <- data$taxon_data[[paste0(site_2, "_prop")]]
  data$taxon_data$scaled_prop_diff <- (prop_1 - prop_2) / mean(c(prop_1, prop_2))
  
  # plot
  set.seed(seed)
  data %>%
    filter_taxa(abundance >= 100) %>%
    filter_taxa(n_supertaxa <= 4) %>%
    plot(node_size_axis_label = "Number of OTUs",
         node_size = n_obs,
         node_color_axis_label = paste0(site_1, " only    Both    ", site_2, " only"),
         node_color = scaled_prop_diff,
         node_color_range = diverging_palette(),
         node_color_trans = "area",
         node_color_interval = round(max(abs(range(scaled_prop_diff)))) * c(-1, 1),
         edge_color_interval = round(max(abs(range(scaled_prop_diff)))) * c(-1, 1),
         node_label = name,
         node_label_max = 50,
         overlap_avoidance = 1)
         # initial_layout = "re", layout = "da",
         # maxiter = 20, fineiter = 20,
         # margin_size = c(0.001, 0.001),
         # output_file = file.path(output_folder, paste0(output_name, "--", site_1, "_vs_", site_2, "--seed_", seed, output_format)))
}


body_sites <- c("Stool", "Saliva", "Tongue_dorsum", "Buccal_mucosa")

combinations <- t(combn(seq_along(body_sites), 2))

layout_matrix <- matrix(rep(NA, (length(body_sites))^2), nrow = length(body_sites))
for (index in 1:nrow(combinations)) {
  layout_matrix[combinations[index,1], combinations[index,2]] <- index
}

bs_combinations <- t(apply(combinations, MARGIN = 1, function(x) body_sites[x]))

plots <- lapply(1:nrow(bs_combinations),
                function(i) plot_body_site_diff_small(v35_data, bs_combinations[i, 1], bs_combinations[i, 2]))
do.call(gridExtra::grid.arrange, ncol = length(body_sites) - 1, nrow =  length(body_sites) - 1,